Importing dependencies¶

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

Loading Dataset¶

In [ ]:
df=pd.read_csv("SampleSuperstore.csv")
df
Out[ ]:
Ship Mode Segment Country City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit
0 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Bookcases 261.9600 2 0.00 41.9136
1 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Chairs 731.9400 3 0.00 219.5820
2 Second Class Corporate United States Los Angeles California 90036 West Office Supplies Labels 14.6200 2 0.00 6.8714
3 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Furniture Tables 957.5775 5 0.45 -383.0310
4 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Office Supplies Storage 22.3680 2 0.20 2.5164
... ... ... ... ... ... ... ... ... ... ... ... ... ...
9989 Second Class Consumer United States Miami Florida 33180 South Furniture Furnishings 25.2480 3 0.20 4.1028
9990 Standard Class Consumer United States Costa Mesa California 92627 West Furniture Furnishings 91.9600 2 0.00 15.6332
9991 Standard Class Consumer United States Costa Mesa California 92627 West Technology Phones 258.5760 2 0.20 19.3932
9992 Standard Class Consumer United States Costa Mesa California 92627 West Office Supplies Paper 29.6000 4 0.00 13.3200
9993 Second Class Consumer United States Westminster California 92683 West Office Supplies Appliances 243.1600 2 0.00 72.9480

9994 rows × 13 columns

Observing Features¶

In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Ship Mode     9994 non-null   object 
 1   Segment       9994 non-null   object 
 2   Country       9994 non-null   object 
 3   City          9994 non-null   object 
 4   State         9994 non-null   object 
 5   Postal Code   9994 non-null   int64  
 6   Region        9994 non-null   object 
 7   Category      9994 non-null   object 
 8   Sub-Category  9994 non-null   object 
 9   Sales         9994 non-null   float64
 10  Quantity      9994 non-null   int64  
 11  Discount      9994 non-null   float64
 12  Profit        9994 non-null   float64
dtypes: float64(3), int64(2), object(8)
memory usage: 1015.1+ KB
In [ ]:
df.describe()
Out[ ]:
Postal Code Sales Quantity Discount Profit
count 9994.000000 9994.000000 9994.000000 9994.000000 9994.000000
mean 55190.379428 229.858001 3.789574 0.156203 28.656896
std 32063.693350 623.245101 2.225110 0.206452 234.260108
min 1040.000000 0.444000 1.000000 0.000000 -6599.978000
25% 23223.000000 17.280000 2.000000 0.000000 1.728750
50% 56430.500000 54.490000 3.000000 0.200000 8.666500
75% 90008.000000 209.940000 5.000000 0.200000 29.364000
max 99301.000000 22638.480000 14.000000 0.800000 8399.976000
In [ ]:
df.isnull().sum()
Out[ ]:
Ship Mode       0
Segment         0
Country         0
City            0
State           0
Postal Code     0
Region          0
Category        0
Sub-Category    0
Sales           0
Quantity        0
Discount        0
Profit          0
dtype: int64

Preprocessing¶

In [ ]:
df.nunique()
Out[ ]:
Ship Mode          4
Segment            3
Country            1
City             531
State             49
Postal Code      631
Region             4
Category           3
Sub-Category      17
Sales           5825
Quantity          14
Discount          12
Profit          7287
dtype: int64
In [ ]:
df['Country'].unique()
Out[ ]:
array(['United States'], dtype=object)

The whole dataset consists of the retail datas only in US

In [ ]:
#droping Country because only US
df.drop("Country",axis=1,inplace=True)
df
Out[ ]:
Ship Mode Segment City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit
0 Second Class Consumer Henderson Kentucky 42420 South Furniture Bookcases 261.9600 2 0.00 41.9136
1 Second Class Consumer Henderson Kentucky 42420 South Furniture Chairs 731.9400 3 0.00 219.5820
2 Second Class Corporate Los Angeles California 90036 West Office Supplies Labels 14.6200 2 0.00 6.8714
3 Standard Class Consumer Fort Lauderdale Florida 33311 South Furniture Tables 957.5775 5 0.45 -383.0310
4 Standard Class Consumer Fort Lauderdale Florida 33311 South Office Supplies Storage 22.3680 2 0.20 2.5164
... ... ... ... ... ... ... ... ... ... ... ... ...
9989 Second Class Consumer Miami Florida 33180 South Furniture Furnishings 25.2480 3 0.20 4.1028
9990 Standard Class Consumer Costa Mesa California 92627 West Furniture Furnishings 91.9600 2 0.00 15.6332
9991 Standard Class Consumer Costa Mesa California 92627 West Technology Phones 258.5760 2 0.20 19.3932
9992 Standard Class Consumer Costa Mesa California 92627 West Office Supplies Paper 29.6000 4 0.00 13.3200
9993 Second Class Consumer Westminster California 92683 West Office Supplies Appliances 243.1600 2 0.00 72.9480

9994 rows × 12 columns

Droping duplicates¶

In [ ]:
df[df.duplicated()]
Out[ ]:
Ship Mode Segment City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit
950 Standard Class Home Office Philadelphia Pennsylvania 19120 East Office Supplies Paper 15.552 3 0.2 5.4432
3406 Standard Class Home Office Columbus Ohio 43229 East Furniture Chairs 281.372 2 0.3 -12.0588
3670 Standard Class Consumer Salem Oregon 97301 West Office Supplies Paper 10.368 2 0.2 3.6288
4117 Standard Class Consumer Los Angeles California 90036 West Office Supplies Paper 19.440 3 0.0 9.3312
4553 Standard Class Consumer San Francisco California 94122 West Office Supplies Paper 12.840 3 0.0 5.7780
5905 Same Day Home Office San Francisco California 94122 West Office Supplies Labels 41.400 4 0.0 19.8720
6146 Standard Class Corporate San Francisco California 94122 West Office Supplies Art 11.760 4 0.0 3.1752
6334 Standard Class Consumer New York City New York 10011 East Office Supplies Paper 49.120 4 0.0 23.0864
6357 Standard Class Corporate Seattle Washington 98103 West Office Supplies Paper 25.920 4 0.0 12.4416
7608 Standard Class Consumer San Francisco California 94122 West Office Supplies Paper 25.920 4 0.0 12.4416
7735 Standard Class Corporate Seattle Washington 98105 West Office Supplies Paper 19.440 3 0.0 9.3312
7759 Standard Class Corporate Houston Texas 77041 Central Office Supplies Paper 15.552 3 0.2 5.4432
8032 First Class Consumer Houston Texas 77041 Central Office Supplies Paper 47.952 3 0.2 16.1838
8095 Second Class Consumer Seattle Washington 98115 West Office Supplies Paper 12.960 2 0.0 6.2208
9262 Standard Class Consumer Detroit Michigan 48227 Central Furniture Chairs 389.970 3 0.0 35.0973
9363 Standard Class Home Office Seattle Washington 98105 West Furniture Furnishings 22.140 3 0.0 6.4206
9477 Second Class Corporate Chicago Illinois 60653 Central Office Supplies Binders 3.564 3 0.8 -6.2370
In [ ]:
df.drop_duplicates(inplace = True)

Analysing the profit column¶

In [ ]:
df['Profit'].describe()
Out[ ]:
count    9977.00000
mean       28.69013
std       234.45784
min     -6599.97800
25%         1.72620
50%         8.67100
75%        29.37200
max      8399.97600
Name: Profit, dtype: float64
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set the figure size
plt.figure(figsize=(10, 6))

# Create a violin plot for the "Profit" column
sns.violinplot(x='Profit', data=df, palette='viridis')

# Set the title and labels
plt.title('Violin Plot of Profit', fontsize=14)
plt.xlabel('Profit', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.xlim(-600,600)
# Show the plot
plt.tight_layout()
plt.show()
In [ ]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='Profit', data=df, palette='Set3')
plt.title('Box Plot of Profit', fontsize=14)
plt.xlabel('Profit', fontsize=12)
plt.xlim(-600,600)
plt.show()
In [ ]:
import matplotlib.pyplot as plt
import scipy.stats as stats

# Create a histogram of the "Profit" column with multiple bins
plt.figure(figsize=(12, 6))

# Set the x-axis limits (minimum and maximum values)
x_min = -500  # Adjust this to your desired minimum value
x_max = 500   # Adjust this to your desired maximum value
plt.hist(df['Profit'], bins=50, color='skyblue', edgecolor='black', alpha=0.7, range=(x_min, x_max))

plt.xlabel('Profit', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Distribution of Profit', fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.6)

# Calculate the mean and standard deviation of the "Profit" column
mean_Profit = df['Profit'].mean()
std_Profit = df['Profit'].std()

# Calculate the z-score for each data point
z_scores = (df['Profit'] - mean_Profit) / std_Profit

# Calculate the 25th, 50th (median), and 75th percentiles for z-scores
percentiles = [stats.percentileofscore(z_scores, x) for x in [-1, 0, 1]]

# Add vertical lines for mean, median, and quartiles
plt.axvline(df['Profit'].mean(), color='red', linestyle='dashed', linewidth=2, label='Mean')
plt.axvline(df['Profit'].median(), color='green', linestyle='dashed', linewidth=2, label='Median')
plt.axvline(df['Profit'].quantile(0.25), color='purple', linestyle='dashed', linewidth=2, label='25th Percentile')
plt.axvline(df['Profit'].quantile(0.75), color='orange', linestyle='dashed', linewidth=2, label='75th Percentile')

# Add a legend
plt.legend()

# Set the x-axis limits
plt.xlim(x_min, x_max)

# Annotate the quantity above the bars
for i, count in enumerate(plt.hist(df['Profit'], bins=50, range=(x_min, x_max))[0]):
    plt.text(
        plt.hist(df['Profit'], bins=50, range=(x_min, x_max))[1][i] + 2,  # X-coordinate
        count + 10,  # Y-coordinate
        f'{int(count)}',  # Text to display (rounded to the nearest integer)
        ha='center', va='bottom', fontsize=10
    )

# Show the plot
plt.tight_layout()
plt.show()
In [ ]:
# Create a function to label profit and loss categories
def label_profit_loss_Profit_Label(profit):
    if profit > 2500:
        return 'Very High Profit (> 2500)'
    elif profit > 1500:
        return 'High Profit (> 1500)'
    elif profit > 500:
        return 'Medium Profit (> 500)'
    elif profit > 0:
        return 'Low Profit (profit > 0)'
    elif profit > -100:
        return 'Medium Loss (< 100)'
    elif profit > -500:
        return 'Medium Loss (< 500)'   
    elif profit < -500:
        return 'Heavy Loss ( > 500)'
    else:
        return 'nan'

# Apply the function to create the new column 'Profit_Label'
df['Profit_Label'] = df['Profit'].apply(label_profit_loss_Profit_Label)


# Display the first few rows of the DataFrame to verify the new column
print(df[['Profit_Label']].value_counts())
Profit_Label             
Low Profit (profit > 0)      7926
Medium Loss (< 100)          1623
Medium Loss (< 500)           261
Medium Profit (> 500)          97
Heavy Loss ( > 500)            50
High Profit (> 1500)           10
Very High Profit (> 2500)      10
dtype: int64
In [ ]:
import matplotlib.pyplot as plt

# Calculate value counts of the 'Profit_Label' column
value_counts = df['Profit_Label'].value_counts()

# Create a bar plot
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
ax = value_counts.plot(kind='bar', color='skyblue')

# Add labels and title
plt.xlabel('Profit Label')
plt.ylabel('Count')
plt.title('Distribution of Profit Labels')
plt.xticks(rotation=40, fontsize=10)

# Annotate the bars with their respective counts
for i, count in enumerate(value_counts):
    plt.text(i, count + 50, str(count), ha='center', va='bottom', fontsize=12)

# Show the plot
plt.show()

Seperating Columns¶

In [ ]:
obj_cols=df.select_dtypes(include='object').columns.tolist()
num_cols=df.select_dtypes(include='number').columns.tolist()
cat_cols=df.select_dtypes(include='category').columns.tolist()

OBJECT COLUMNS¶

In [ ]:
obj_cols
Out[ ]:
['Ship Mode',
 'Segment',
 'City',
 'State',
 'Region',
 'Category',
 'Sub-Category',
 'Profit_Label']
In [ ]:
for i in obj_cols:
    print(df[i].nunique() ,i)
4 Ship Mode
3 Segment
531 City
49 State
4 Region
3 Category
17 Sub-Category
7 Profit_Label
In [ ]:
for i in obj_cols:
    print(i,"\n\n",df[i].value_counts())
Ship Mode 

 Standard Class    5955
Second Class      1943
First Class       1537
Same Day           542
Name: Ship Mode, dtype: int64
Segment 

 Consumer       5183
Corporate      3015
Home Office    1779
Name: Segment, dtype: int64
City 

 New York City      914
Los Angeles        746
Philadelphia       536
San Francisco      506
Seattle            424
                  ... 
Glenview             1
Missouri City        1
Rochester Hills      1
Palatine             1
Manhattan            1
Name: City, Length: 531, dtype: int64
State 

 California              1996
New York                1127
Texas                    983
Pennsylvania             586
Washington               502
Illinois                 491
Ohio                     468
Florida                  383
Michigan                 254
North Carolina           249
Arizona                  224
Virginia                 224
Georgia                  184
Tennessee                183
Colorado                 182
Indiana                  149
Kentucky                 139
Massachusetts            135
New Jersey               130
Oregon                   123
Wisconsin                110
Maryland                 105
Delaware                  96
Minnesota                 89
Connecticut               82
Oklahoma                  66
Missouri                  66
Alabama                   61
Arkansas                  60
Rhode Island              56
Utah                      53
Mississippi               53
Louisiana                 42
South Carolina            42
Nevada                    39
Nebraska                  38
New Mexico                37
Iowa                      30
New Hampshire             27
Kansas                    24
Idaho                     21
Montana                   15
South Dakota              12
Vermont                   11
District of Columbia      10
Maine                      8
North Dakota               7
West Virginia              4
Wyoming                    1
Name: State, dtype: int64
Region 

 West       3193
East       2845
Central    2319
South      1620
Name: Region, dtype: int64
Category 

 Office Supplies    6012
Furniture          2118
Technology         1847
Name: Category, dtype: int64
Sub-Category 

 Binders        1522
Paper          1359
Furnishings     956
Phones          889
Storage         846
Art             795
Accessories     775
Chairs          615
Appliances      466
Labels          363
Tables          319
Envelopes       254
Bookcases       228
Fasteners       217
Supplies        190
Machines        115
Copiers          68
Name: Sub-Category, dtype: int64
Profit_Label 

 Low Profit (profit > 0)      7926
Medium Loss (< 100)          1623
Medium Loss (< 500)           261
Medium Profit (> 500)          97
Heavy Loss ( > 500)            50
High Profit (> 1500)           10
Very High Profit (> 2500)      10
Name: Profit_Label, dtype: int64

Insight plots¶

In [ ]:
fig, axes = plt.subplots(len(obj_cols), 2, figsize=(12, 6*len(obj_cols)))
plt.subplots_adjust(hspace=0.5)

for i, col in enumerate(obj_cols):
    # Plot the bar plot
    col_counts = df[col].value_counts()
    axes[i, 0].bar(col_counts.index, col_counts.values, edgecolor='black', alpha=0.7)
    axes[i, 0].set_xlabel(col, fontsize=12)
    axes[i, 0].set_ylabel('Frequency', fontsize=12)
    axes[i, 0].set_title(f'Distribution of {col} (Bar Plot)', fontsize=14)
    axes[i, 0].tick_params(axis='x', rotation=45, labelsize=10)
    for j, v in enumerate(col_counts.values):
        axes[i, 0].text(j, v + 10, str(v), ha='center', va='bottom', fontsize=10)
    axes[i, 0].spines['top'].set_visible(False)
    axes[i, 0].spines['right'].set_visible(False)

    # Plot the pie chart
    col_counts.plot(kind="pie", autopct='%1.1f%%', ax=axes[i, 1])
    axes[i, 1].set_title(f'Distribution of {col} (Pie Chart)', fontsize=14)
    axes[i, 1].set_ylabel("")  # Remove the default ylabel

# Show the plots
plt.tight_layout()
plt.show()
In [ ]:
import matplotlib.pyplot as plt

# Create subplots for each column
fig, axes = plt.subplots(len(obj_cols), 1, figsize=(12, 6*len(obj_cols)))
plt.subplots_adjust(hspace=0.5)

for i, col in enumerate(obj_cols):
    # Calculate counts for each category in the column for both "Profit_Label"
    col_counts = df[col].value_counts()
    profit_labels = df['Profit_Label'].unique()

    # Initialize a dictionary to store counts for each "Profit_Label"
    counts_by_profit_label = {label: [] for label in profit_labels}

    for label in profit_labels:
        # Get the counts for the current "Profit_Label"
        label_counts = df[df['Profit_Label'] == label][col].value_counts()
        categories = col_counts.index

        for category in categories:
            # Get the count for the current category and "Profit_Label"
            count = label_counts.get(category, 0)

            # Append the count to the respective list
            counts_by_profit_label[label].append(count)

    # Create a stacked bar plot for each "Profit_Label"
    for label in profit_labels:
        axes[i].bar(categories, counts_by_profit_label[label], label=label)

        # Annotate each bar with the count value
        for j, count in enumerate(counts_by_profit_label[label]):
            axes[i].annotate(count, (j, counts_by_profit_label[label][j] + 5), ha='center', fontsize=8)

    axes[i].set_xlabel(col, fontsize=12)
    axes[i].set_ylabel('Frequency', fontsize=12)
    axes[i].set_title(f'Distribution of {col} by Profit_Label (Stacked Bar Plot)', fontsize=14)
    axes[i].tick_params(axis='x', rotation=45, labelsize=10)
    axes[i].spines['top'].set_visible(False)
    axes[i].spines['right'].set_visible(False)
    axes[i].legend()

# Show the plots
plt.tight_layout()
plt.show()
In [ ]:
import matplotlib.pyplot as plt

# Create subplots for each column
fig, axes = plt.subplots(len(obj_cols), 1, figsize=(12, 6*len(obj_cols)))
plt.subplots_adjust(hspace=0.5)

for i, col in enumerate(obj_cols):
    # Get unique categories in the column
    categories = df[col].unique()
    
    # Initialize a dictionary to store counts for each category by "Profit_Label"
    counts_by_profit_label = {label: [] for label in df['Profit_Label'].unique()}
    
    for label in df['Profit_Label'].unique():
        # Get the counts for each category by "Profit_Label"
        category_counts = df[df['Profit_Label'] == label][col].value_counts()
        
        # Append the counts to the respective list
        counts_by_profit_label[label] = [category_counts.get(cat, 0) for cat in categories]
    
    # Create a stacked bar plot
    bottom = [0] * len(categories)
    
    for label in df['Profit_Label'].unique():
        bars = axes[i].bar(categories, counts_by_profit_label[label], label=label, bottom=bottom)
        
        for bar, count in zip(bars, counts_by_profit_label[label]):
            if count > 0:
                axes[i].annotate(str(count), xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()), 
                                 xytext=(0, 3), textcoords='offset points', ha='center', va='bottom')

        bottom = [prev + curr for prev, curr in zip(bottom, counts_by_profit_label[label])]
    
    axes[i].set_xlabel(col, fontsize=12)
    axes[i].set_ylabel('Frequency', fontsize=12)
    axes[i].set_title(f'Distribution of {col} by Profit_Label (Stacked Bar Plot)', fontsize=14)
    axes[i].tick_params(axis='x', rotation=45, labelsize=10)
    axes[i].spines['top'].set_visible(False)
    axes[i].spines['right'].set_visible(False)
    axes[i].legend()

# Show the plots
plt.tight_layout()
plt.show()
In [ ]:
import matplotlib.pyplot as plt

# Define the columns of interest
columns_of_interest = ['City', 'State']

# Create subplots for the columns of interest
fig, axes = plt.subplots(len(columns_of_interest), 2, figsize=(12, 6*len(columns_of_interest)))
plt.subplots_adjust(hspace=0.5)

for i, col in enumerate(columns_of_interest):
    # Calculate counts for each category in the column for both "Profit_Label"
    col_counts = df[col].value_counts()
    
    # Get the top 5 and bottom 5 categories
    top5 = col_counts.head(5)
    bottom5 = col_counts.tail(5)
    
    # Combine the top 5 and bottom 5 categories
    categories = top5.index.tolist() + bottom5.index.tolist()
    
    # Get the counts for the combined categories
    combined_counts = [col_counts.get(category, 0) for category in categories]
    
    # Create a bar plot
    axes[i, 0].bar(categories, combined_counts)
    axes[i, 0].set_xlabel(col, fontsize=12)
    axes[i, 0].set_ylabel('Frequency', fontsize=12)
    axes[i, 0].set_title(f'Distribution of {col} (Bar Plot)', fontsize=14)
    axes[i, 0].tick_params(axis='x', rotation=45, labelsize=10)
    axes[i, 0].spines['top'].set_visible(False)
    axes[i, 0].spines['right'].set_visible(False)
    
    # Add annotations to the bar plot
    for j, count in enumerate(combined_counts):
        axes[i, 0].annotate(str(count), (categories[j], count), ha='center', va='bottom', fontsize=10)
    
    # Create a pie chart
    axes[i, 1].pie(combined_counts, labels=categories, autopct='%1.1f%%', startangle=90)
    axes[i, 1].set_title(f'Distribution of {col} (Pie Chart)', fontsize=14)

# Show the plots
plt.tight_layout()
plt.show()
In [ ]:
sns.heatmap(df[['Sales','Quantity',"Discount","Profit"]].corr(),annot=True,cmap="Reds")
Out[ ]:
<Axes: >

SUM¶

pattern

positive
sales quantity Profit
quantity Sales
Profit Sales

negative
disc Sales

In [ ]:
df.groupby(by=['Segment']).sum(numeric_only=True).sort_values("Profit") #insight
Out[ ]:
Postal Code Sales Quantity Discount Profit
Segment
Home Office 97903137 4.292927e+05 6732 261.83 60279.0015
Corporate 164108306 7.060701e+05 11591 476.85 91954.9798
Consumer 288269634 1.160833e+06 19497 820.51 134007.4413
In [ ]:
df.groupby(by=['Ship Mode']).sum(numeric_only=True)
Out[ ]:
Postal Code Sales Quantity Discount Profit
Ship Mode
First Class 84152470 3.513805e+05 5690 252.97 48953.6561
Same Day 31147971 1.283217e+05 1956 82.75 15871.8869
Second Class 108033820 4.591770e+05 7418 269.35 57446.6516
Standard Class 326946816 1.357316e+06 22756 954.12 163969.2280
In [ ]:
 
In [ ]:
 
In [ ]:
 

Numeric Columns¶

In [ ]:
df[num_cols].describe()
Out[ ]:
Postal Code Sales Quantity Discount Profit
count 9977.000000 9977.000000 9977.000000 9977.000000 9977.00000
mean 55154.964117 230.148902 3.790719 0.156278 28.69013
std 32058.266816 623.721409 2.226657 0.206455 234.45784
min 1040.000000 0.444000 1.000000 0.000000 -6599.97800
25% 23223.000000 17.300000 2.000000 0.000000 1.72620
50% 55901.000000 54.816000 3.000000 0.200000 8.67100
75% 90008.000000 209.970000 5.000000 0.200000 29.37200
max 99301.000000 22638.480000 14.000000 0.800000 8399.97600
In [ ]:
df['Sales'].nunique()
Out[ ]:
5825
In [ ]:
df[num_cols].sort_values("Sales")
Out[ ]:
Postal Code Sales Quantity Discount Profit
4101 77095 0.444 1 0.8 -1.1100
9292 76706 0.556 1 0.8 -0.9452
8658 60623 0.836 1 0.8 -1.3376
4711 19120 0.852 1 0.7 -0.5964
2106 75150 0.876 1 0.8 -1.4016
... ... ... ... ... ...
4190 19711 10499.970 3 0.0 5039.9856
2623 10024 11199.968 4 0.2 3919.9888
8153 98115 13999.960 4 0.0 6719.9808
6826 47905 17499.950 5 0.0 8399.9760
2697 32216 22638.480 6 0.5 -1811.0784

9977 rows × 5 columns

In [ ]:
sns.displot(x=df['Profit'],y=df['Sales'])
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x212a1c9e290>
In [ ]:
sns.pairplot(df[num_cols],height=3)
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x21294167e10>
In [ ]:
df.groupby(by=['']).agg([sum,max,describe,min,count,...])
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
c:\Users\Irshad Yasseen A\Documents\Jupy\Untitled Folder\intern\Sparks\task2\Retail_EDA.ipynb Cell 48 line 1
----> <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#X63sZmlsZQ%3D%3D?line=0'>1</a> df.groupby(by=['']).agg([sum,max,describe,min,count,...])

File c:\Users\Irshad Yasseen A\anaconda3\Lib\site-packages\pandas\core\frame.py:8402, in DataFrame.groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, observed, dropna)
   8399     raise TypeError("You have to supply one of 'by' and 'level'")
   8400 axis = self._get_axis_number(axis)
-> 8402 return DataFrameGroupBy(
   8403     obj=self,
   8404     keys=by,
   8405     axis=axis,
   8406     level=level,
   8407     as_index=as_index,
   8408     sort=sort,
   8409     group_keys=group_keys,
   8410     squeeze=squeeze,
   8411     observed=observed,
   8412     dropna=dropna,
   8413 )

File c:\Users\Irshad Yasseen A\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:965, in GroupBy.__init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, mutated, dropna)
    962 if grouper is None:
    963     from pandas.core.groupby.grouper import get_grouper
--> 965     grouper, exclusions, obj = get_grouper(
    966         obj,
    967         keys,
    968         axis=axis,
    969         level=level,
    970         sort=sort,
    971         observed=observed,
    972         mutated=self.mutated,
    973         dropna=self.dropna,
    974     )
    976 self.obj = obj
    977 self.axis = obj._get_axis_number(axis)

File c:\Users\Irshad Yasseen A\anaconda3\Lib\site-packages\pandas\core\groupby\grouper.py:888, in get_grouper(obj, key, axis, level, sort, observed, mutated, validate, dropna)
    886         in_axis, level, gpr = False, gpr, None
    887     else:
--> 888         raise KeyError(gpr)
    889 elif isinstance(gpr, Grouper) and gpr.key is not None:
    890     # Add key to exclusions
    891     exclusions.add(gpr.key)

KeyError: ''
In [ ]:
def print_alpha_nums(abc_list, num_list):
    for char in abc_list:
        for num in num_list:
            print(char, num)
    return

print_alpha_nums(['a', 'b', 'c'], [1, 2, 3])
a 1
a 2
a 3
b 1
b 2
b 3
c 1
c 2
c 3
In [ ]:
import pandas as pd

# Assuming your dataset is stored in a DataFrame called 'df'

# Total Sales and Profit
total_Sales = df['Sales'].sum()
total_Profit = df['Profit'].sum()

# Average Sales and Profit
average_Sales = df['Sales'].mean()
average_Profit = df['Profit'].mean()

# Sales by Category
Sales_by_Profit_Label = df.groupby('Profit_Label')['Sales'].sum()

# Sales by SubProfit_Label
Sales_by_subProfit_Label = df.groupby('sub Profit_Label')['Sales'].sum()

# Total Quantity Sold
total_quantity_sold = df['quantity'].sum()

# Average Quantity Sold
average_quantity_sold = df['quantity'].mean()

# Total Discount Given
total_discount_given = df['discount'].sum()

# Average Discount
average_discount = df['discount'].mean()

# Sales by Region
Sales_by_region = df.groupby('region')['Sales'].sum()

# Profit by City
Profit_by_city = df.groupby('city')['Profit'].sum()

# Customer Segmentation
customer_segmentation = df.groupby(['segment', 'country'])['Sales'].sum()

# Shipping Mode Insights
shipping_mode_counts = df['ship mode'].value_counts()

# You can print or further analyze these variables as needed.
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
c:\Users\Irshad Yasseen A\Documents\Jupy\Untitled Folder\intern\Sparks\task2\Retail_EDA.ipynb Cell 48 line 1
     <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#X52sZmlsZQ%3D%3D?line=13'>14</a> Sales_by_Profit_Label = df.groupby('Profit_Label')['Sales'].sum()
     <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#X52sZmlsZQ%3D%3D?line=15'>16</a> # Sales by SubProfit_Label
---> <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#X52sZmlsZQ%3D%3D?line=16'>17</a> Sales_by_subProfit_Label = df.groupby('sub Profit_Label')['Sales'].sum()
     <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#X52sZmlsZQ%3D%3D?line=18'>19</a> # Total Quantity Sold
     <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#X52sZmlsZQ%3D%3D?line=19'>20</a> total_quantity_sold = df['quantity'].sum()

File c:\Users\Irshad Yasseen A\anaconda3\Lib\site-packages\pandas\core\frame.py:8402, in DataFrame.groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, observed, dropna)
   8399     raise TypeError("You have to supply one of 'by' and 'level'")
   8400 axis = self._get_axis_number(axis)
-> 8402 return DataFrameGroupBy(
   8403     obj=self,
   8404     keys=by,
   8405     axis=axis,
   8406     level=level,
   8407     as_index=as_index,
   8408     sort=sort,
   8409     group_keys=group_keys,
   8410     squeeze=squeeze,
   8411     observed=observed,
   8412     dropna=dropna,
   8413 )

File c:\Users\Irshad Yasseen A\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:965, in GroupBy.__init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, mutated, dropna)
    962 if grouper is None:
    963     from pandas.core.groupby.grouper import get_grouper
--> 965     grouper, exclusions, obj = get_grouper(
    966         obj,
    967         keys,
    968         axis=axis,
    969         level=level,
    970         sort=sort,
    971         observed=observed,
    972         mutated=self.mutated,
    973         dropna=self.dropna,
    974     )
    976 self.obj = obj
    977 self.axis = obj._get_axis_number(axis)

File c:\Users\Irshad Yasseen A\anaconda3\Lib\site-packages\pandas\core\groupby\grouper.py:888, in get_grouper(obj, key, axis, level, sort, observed, mutated, validate, dropna)
    886         in_axis, level, gpr = False, gpr, None
    887     else:
--> 888         raise KeyError(gpr)
    889 elif isinstance(gpr, Grouper) and gpr.key is not None:
    890     # Add key to exclusions
    891     exclusions.add(gpr.key)

KeyError: 'sub Profit_Label'
In [ ]:
import matplotlib.pyplot as plt

# Define the selected cities you want to plot
selected_cities = ['New York City', 'Los Angeles', 'Chicago', 'Houston', 'Philadelphia']

# Filter the DataFrame to include only the selected cities
filtered_df = df[df['City'].isin(selected_cities)]

# Calculate the total profit for each selected city
city_profits = filtered_df.groupby('City')['Profit'].sum()

# Create a bar plot
plt.figure(figsize=(10, 6))
plt.bar(city_profits.index, city_profits.values, color='skyblue')
plt.xlabel('City', fontsize=12)
plt.ylabel('Total Profit', fontsize=12)
plt.title('Total Profit by Selected Cities', fontsize=14)
plt.xticks(rotation=45, fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Show the plot
plt.show()
In [ ]:
import matplotlib.pyplot as plt

# Calculate the total profit for each city
city_profits = df.groupby('City')['Profit'].sum()

# Sort the cities by profit in descending order
sorted_cities = city_profits.sort_values(ascending=False)

# Select the top 10 and bottom 10 cities
top_10_cities = sorted_cities.head(10)
bottom_10_cities = sorted_cities.tail(10)

# Combine the top 10 and bottom 10 cities
selected_cities = top_10_cities.index.tolist() + bottom_10_cities.index.tolist()

# Filter the DataFrame to include only the selected cities
filtered_df = df[df['City'].isin(selected_cities)]

# Calculate the total profit for each selected city
city_profits = filtered_df.groupby('City')['Profit'].sum()

# Create a bar plot
plt.figure(figsize=(12, 6))
bars = plt.bar(city_profits.index, city_profits.values, color='skyblue')
plt.xlabel('City', fontsize=12)
plt.ylabel('Total Profit', fontsize=12)
plt.title('Total Profit by Top 10 and Bottom 10 Cities', fontsize=14)
plt.xticks(rotation=45, fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Annotate the bars with profit values
for bar in bars:
    height = bar.get_height()
    plt.annotate(f'{height:.2f}', xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3), textcoords='offset points', ha='center', va='bottom', fontsize=10)

# Show the plot
plt.show()
In [ ]:
import matplotlib.pyplot as plt

# Filter the DataFrame to include only cities with profit
profit_cities = df[df['Profit'] > 0]

# Filter the DataFrame to include only cities with loss
loss_cities = df[df['Profit'] < 0]

# Sort the profit cities by profit in descending order and select the top 10
top_10_profit_cities = profit_cities.sort_values(by='Profit', ascending=False).head(10)

# Sort the loss cities by profit in ascending order and select the top 10
top_10_loss_cities = loss_cities.sort_values(by='Profit', ascending=True).head(10)

# Concatenate the top 10 profit and top 10 loss cities
top_10_cities_combined = pd.concat([top_10_profit_cities, top_10_loss_cities])

# Create a bar plot
plt.figure(figsize=(12, 6))
bars = plt.bar(top_10_cities_combined['City'], top_10_cities_combined['Profit'], color=['skyblue' if profit > 0 else 'lightcoral' for profit in top_10_cities_combined['Profit']])
plt.xlabel('City', fontsize=12)
plt.ylabel('Profit/Loss', fontsize=12)
plt.title('Top 10 Cities by Profit and Loss', fontsize=14)
plt.xticks(rotation=45, fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Annotate the bars with profit/loss values
for bar in bars:
    height = bar.get_height()
    plt.annotate(f'{height:.2f}', xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3), textcoords='offset points', ha='center', va='bottom', fontsize=10)

# Show the plot
plt.show()
In [ ]:
import matplotlib.pyplot as plt

# Filter the DataFrame to include only cities with profit
profit_cities = df[df['Profit'] > 0]

# Filter the DataFrame to include only cities with loss
loss_cities = df[df['Profit'] < 0]

# Sort the profit cities by profit in descending order and select the top 10
top_10_profit_cities = profit_cities.sort_values(by='Profit', ascending=False).head(10)

# Sort the loss cities by profit in ascending order and select the top 10
top_10_loss_cities = loss_cities.sort_values(by='Profit', ascending=True).head(10)

# Create two separate subplots
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

# Plot the top 10 cities with profit
ax1.bar(top_10_profit_cities['City'], top_10_profit_cities['Profit'], color='skyblue')
ax1.set_xlabel('City', fontsize=12)
ax1.set_ylabel('Profit', fontsize=12)
ax1.set_title('Top 10 Cities by Profit', fontsize=14)
ax1.tick_params(axis='x', rotation=45, labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)

# Plot the top 10 cities with loss
ax2.bar(top_10_loss_cities['City'], top_10_loss_cities['Profit'], color='lightcoral')
ax2.set_xlabel('City', fontsize=12)
ax2.set_ylabel('Loss', fontsize=12)
ax2.set_title('Top 10 Cities by Loss', fontsize=14)
ax2.tick_params(axis='x', rotation=45, labelsize=10)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Adjust spacing between subplots
plt.tight_layout()

# Show the plots
plt.show()
In [ ]:
import matplotlib.pyplot as plt

# Calculate the total profit for each city
city_profits = df.groupby('City')['Profit'].sum()

# Sort the cities by profit in descending order and select the top 10
top_10_cities = city_profits.sort_values(ascending=False).head(10)

# Create a pie chart
plt.figure(figsize=(8, 8))
plt.pie(top_10_cities.values, labels=top_10_cities.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Profit Among Leading Cities', fontsize=14)

# Show the plot
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
In [ ]:
 
In [ ]:
df[df['City']=='Lafayette']
Out[ ]:
Ship Mode Segment City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit Profit_Label
1627 Standard Class Consumer Lafayette Louisiana 70506 South Furniture Chairs 517.50 6 0.0 155.2500 Low Profit (profit > 0)
1932 Second Class Consumer Lafayette Louisiana 70506 South Furniture Bookcases 145.74 3 0.0 23.3184 Low Profit (profit > 0)
1933 Second Class Consumer Lafayette Louisiana 70506 South Furniture Furnishings 15.40 5 0.0 7.3920 Low Profit (profit > 0)
2332 Standard Class Consumer Lafayette Indiana 47905 Central Office Supplies Paper 277.40 5 0.0 133.1520 Low Profit (profit > 0)
2333 Standard Class Consumer Lafayette Indiana 47905 Central Office Supplies Paper 5.78 1 0.0 2.8322 Low Profit (profit > 0)
2573 Standard Class Home Office Lafayette Indiana 47905 Central Furniture Furnishings 526.45 5 0.0 31.5870 Low Profit (profit > 0)
3530 Standard Class Corporate Lafayette Indiana 47905 Central Office Supplies Art 21.40 5 0.0 6.2060 Low Profit (profit > 0)
3588 First Class Consumer Lafayette Louisiana 70506 South Office Supplies Supplies 1665.62 2 0.0 33.3124 Low Profit (profit > 0)
4029 Standard Class Consumer Lafayette Louisiana 70506 South Technology Machines 479.97 3 0.0 239.9850 Low Profit (profit > 0)
4030 Standard Class Consumer Lafayette Louisiana 70506 South Furniture Chairs 232.88 4 0.0 60.5488 Low Profit (profit > 0)
4361 Second Class Consumer Lafayette Louisiana 70506 South Technology Phones 29.16 3 0.0 8.4564 Low Profit (profit > 0)
4943 Standard Class Consumer Lafayette Indiana 47905 Central Office Supplies Storage 375.34 1 0.0 18.7670 Low Profit (profit > 0)
5120 Standard Class Corporate Lafayette Louisiana 70506 South Office Supplies Storage 232.55 5 0.0 9.3020 Low Profit (profit > 0)
5121 Standard Class Corporate Lafayette Louisiana 70506 South Technology Accessories 99.98 2 0.0 42.9914 Low Profit (profit > 0)
5122 Standard Class Corporate Lafayette Louisiana 70506 South Office Supplies Paper 19.44 3 0.0 9.3312 Low Profit (profit > 0)
5123 Standard Class Corporate Lafayette Louisiana 70506 South Office Supplies Paper 12.96 2 0.0 6.3504 Low Profit (profit > 0)
5719 Second Class Consumer Lafayette Indiana 47905 Central Office Supplies Storage 81.40 5 0.0 21.1640 Low Profit (profit > 0)
6401 First Class Consumer Lafayette Louisiana 70506 South Furniture Bookcases 241.96 2 0.0 33.8744 Low Profit (profit > 0)
6402 First Class Consumer Lafayette Louisiana 70506 South Office Supplies Binders 3.89 1 0.0 1.8672 Low Profit (profit > 0)
6403 First Class Consumer Lafayette Louisiana 70506 South Furniture Furnishings 8.01 3 0.0 3.0438 Low Profit (profit > 0)
6825 Standard Class Corporate Lafayette Indiana 47905 Central Office Supplies Storage 32.48 2 0.0 4.8720 Low Profit (profit > 0)
6826 Standard Class Corporate Lafayette Indiana 47905 Central Technology Copiers 17499.95 5 0.0 8399.9760 Very High Profit (> 2500)
6827 Standard Class Corporate Lafayette Indiana 47905 Central Office Supplies Binders 735.98 2 0.0 331.1910 Low Profit (profit > 0)
6828 Standard Class Corporate Lafayette Indiana 47905 Central Office Supplies Binders 34.37 7 0.0 16.8413 Low Profit (profit > 0)
6829 Standard Class Corporate Lafayette Indiana 47905 Central Office Supplies Art 33.96 2 0.0 9.5088 Low Profit (profit > 0)
8740 First Class Consumer Lafayette Louisiana 70506 South Office Supplies Art 73.20 5 0.0 21.2280 Low Profit (profit > 0)
9220 First Class Consumer Lafayette Louisiana 70506 South Office Supplies Labels 11.07 3 0.0 5.2029 Low Profit (profit > 0)
9221 First Class Consumer Lafayette Louisiana 70506 South Furniture Tables 1504.52 4 0.0 346.0396 Low Profit (profit > 0)
9222 First Class Consumer Lafayette Louisiana 70506 South Office Supplies Paper 25.92 4 0.0 12.4416 Low Profit (profit > 0)
9886 Standard Class Consumer Lafayette Indiana 47905 Central Office Supplies Fasteners 5.94 3 0.0 0.0000 Medium Loss (< 100)
9980 Second Class Consumer Lafayette Louisiana 70506 South Furniture Tables 85.98 1 0.0 22.3548 Low Profit (profit > 0)
In [ ]:
import matplotlib.pyplot as plt

# Calculate the count of products in each category
category_counts = df['Category'].value_counts()

# Create a pie chart for product distribution by category
plt.figure(figsize=(8, 8))
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Products by Category', fontsize=14)

# Show the pie chart
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
In [ ]:
# Calculate the count of jobs in each category
job_counts = df['Category'].value_counts()

# Create a bar chart for the distribution of jobs by category
plt.figure(figsize=(10, 6))
bars = plt.bar(job_counts.index, job_counts.values, color='skyblue')
plt.xlabel('Category', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title('Distribution of Jobs by Category', fontsize=14)
plt.xticks(rotation=45, fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Annotate the bars with count values
for bar in bars:
    height = bar.get_height()
    plt.annotate(f'{int(height)}', xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3), textcoords='offset points', ha='center', va='bottom', fontsize=10)

# Show the bar chart
plt.tight_layout()
plt.show()
In [ ]:
import matplotlib.pyplot as plt

# Calculate the total profit for each category
category_profits = df.groupby('Category')['Profit'].sum()

# Create a pie chart for profit distribution by category
plt.figure(figsize=(8, 8))
plt.pie(category_profits, labels=category_profits.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Profit by Category', fontsize=14)

# Show the pie chart
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
In [ ]:
# Calculate the total profit for each product or sub-category
product_profits = df.groupby('Sub-Category')['Profit'].sum()

# Sort the products by profit in descending order
sorted_products = product_profits.sort_values(ascending=False)

# Create a bar chart for the distribution of profit by product or sub-category
plt.figure(figsize=(12, 6))
bars = plt.bar(sorted_products.index, sorted_products.values, color='skyblue')
plt.xlabel('Product/Sub-Category', fontsize=12)
plt.ylabel('Total Profit', fontsize=12)
plt.title('Distribution of Profit by Product/Sub-Category', fontsize=14)
plt.xticks(rotation=45, fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Annotate the bars with profit values
for bar in bars:
    height = bar.get_height()
    plt.annotate(f'{height:.2f}', xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3), textcoords='offset points', ha='center', va='bottom', fontsize=10)

# Show the bar chart
plt.tight_layout()
plt.show()
In [ ]:
import matplotlib.pyplot as plt

# Calculate the total profit for each product or sub-category
product_profits = df.groupby('Sub-Category')['Profit'].sum()

# Sort the products by profit in descending order
sorted_products = product_profits.sort_values(ascending=False)

# Define a list of colors for the bars
bar_colors = ['skyblue' if profit > 0 else 'lightcoral' for profit in sorted_products.values]

# Create a bar chart for the distribution of profit by product or sub-category
plt.figure(figsize=(12, 6))
bars = plt.bar(sorted_products.index, sorted_products.values, color=bar_colors)
plt.xlabel('Product/Sub-Category', fontsize=12)
plt.ylabel('Total Profit', fontsize=12)
plt.title('Distribution of Profit by Product/Sub-Category', fontsize=14)
plt.xticks(rotation=45, fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Annotate the bars with profit values
for bar in bars:
    height = bar.get_height()
    plt.annotate(f'{height:.2f}', xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3), textcoords='offset points', ha='center', va='bottom', fontsize=10)

# Show the bar chart
plt.tight_layout()
plt.show()
In [ ]:
import matplotlib.pyplot as plt

# Filter the DataFrame for profit and loss separately
profit_df = df[df['Profit'] > 0]
loss_df = df[df['Profit'] < 0]

# Calculate the total profit for each product or sub-category in the profit DataFrame
profit_profits = profit_df.groupby('Sub-Category')['Profit'].sum()

# Calculate the total loss for each product or sub-category in the loss DataFrame
loss_profits = loss_df.groupby('Sub-Category')['Profit'].sum()

# Sort the profit and loss products by profit in descending order
sorted_profit_products = profit_profits.sort_values(ascending=False)
sorted_loss_products = loss_profits.sort_values(ascending=True)

# Create a bar chart for the distribution of profit by product or sub-category (Profit)
plt.figure(figsize=(12, 6))
bars_profit = plt.bar(sorted_profit_products.index, sorted_profit_products.values, color='skyblue')
plt.xlabel('Product/Sub-Category', fontsize=12)
plt.ylabel('Total Profit', fontsize=12)
plt.title('Distribution of Profit by Product/Sub-Category (Profit)', fontsize=14)
plt.xticks(rotation=45, fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Annotate the bars with profit values
for bar in bars_profit:
    height = bar.get_height()
    plt.annotate(f'{height:.2f}', xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3), textcoords='offset points', ha='center', va='bottom', fontsize=10)

# Create a bar chart for the distribution of loss by product or sub-category (Loss)
plt.figure(figsize=(12, 6))
bars_loss = plt.bar(sorted_loss_products.index, sorted_loss_products.values, color='lightcoral')
plt.xlabel('Product/Sub-Category', fontsize=12)
plt.ylabel('Total Loss', fontsize=12)
plt.title('Distribution of Loss by Product/Sub-Category (Loss)', fontsize=14)
plt.xticks(rotation=45, fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Annotate the bars with loss values
for bar in bars_loss:
    height = bar.get_height()
    plt.annotate(f'{height:.2f}', xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3), textcoords='offset points', ha='center', va='bottom', fontsize=10)

# Show the bar charts separately
plt.tight_layout()
plt.show()
In [ ]:
import matplotlib.pyplot as plt
import numpy as np

# Filter the DataFrame for profit and loss separately
profit_df = df[df['Profit'] > 0]
loss_df = df[df['Profit'] < 0]

# Calculate the total profit for each product or sub-category in the profit DataFrame
profit_profits = profit_df.groupby('Sub-Category')['Profit'].sum()

# Calculate the total loss for each product or sub-category in the loss DataFrame
loss_profits = loss_df.groupby('Sub-Category')['Profit'].sum()

# Sort the profit and loss products by profit in descending order
sorted_profit_products = profit_profits.sort_values(ascending=False)
sorted_loss_products = loss_profits.sort_values(ascending=True)

# Create a figure with a single subplot
fig, ax = plt.subplots(figsize=(12, 8))

# Create horizontal bar charts for profit (right) and loss (left)
bars_profit = ax.barh(np.arange(len(sorted_profit_products)), sorted_profit_products.values, color='skyblue', label='Profit')
bars_loss = ax.barh(np.arange(len(sorted_loss_products)), sorted_loss_products.values, color='lightcoral', label='Loss')

# Set y-axis labels to the product/sub-category names
ax.set_yticks(np.arange(len(sorted_profit_products)))
ax.set_yticklabels(sorted_profit_products.index, fontsize=10)
ax.invert_yaxis()  # Invert the y-axis to have the highest value at the top
ax.set_xlabel('Total Profit / Loss', fontsize=12)
ax.set_title('Distribution of Profit and Loss by Product/Sub-Category', fontsize=14)
ax.grid(axis='x', linestyle='--', alpha=0.7)

# Annotate the bars with profit and loss values
for bar in bars_profit:
    width = bar.get_width()
    ax.annotate(f'{width:.2f}', xy=(width, bar.get_y() + bar.get_height() / 2),
                xytext=(3, 0), textcoords='offset points', ha='left', va='center', fontsize=10)

for bar in bars_loss:
    width = bar.get_width()
    ax.annotate(f'{width:.2f}', xy=(width, bar.get_y() + bar.get_height() / 2),
                xytext=(-12, 0), textcoords='offset points', ha='right', va='center', fontsize=10)

# Add a legend
ax.legend(loc='upper right')

# Show the combined plot
plt.tight_layout()
plt.show()
In [ ]:
import matplotlib.pyplot as plt
import numpy as np

# Calculate the total profit for each segment
segment_profits = df.groupby('Segment')['Profit'].sum()

# Calculate the total loss for each segment
segment_losses = df[df['Profit'] < 0].groupby('Segment')['Profit'].sum()

# Sort the segments by profit in descending order
sorted_segment_profits = segment_profits.sort_values(ascending=False)
sorted_segment_losses = segment_losses.sort_values(ascending=True)

# Create a figure with two subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Create horizontal bar chart for profit by segment (left subplot)
bars_segment_profit = ax1.barh(np.arange(len(sorted_segment_profits)), sorted_segment_profits.values, color='skyblue', label='Profit')
ax1.set_yticks(np.arange(len(sorted_segment_profits)))
ax1.set_yticklabels(sorted_segment_profits.index, fontsize=12)
ax1.invert_yaxis()  # Invert the y-axis to have the highest value at the top
ax1.set_xlabel('Total Profit', fontsize=12)
ax1.set_title('Distribution of Profit by Segment', fontsize=14)
ax1.grid(axis='x', linestyle='--', alpha=0.7)

# Annotate the bars with profit values
for bar in bars_segment_profit:
    width = bar.get_width()
    ax1.annotate(f'{width:.2f}', xy=(width, bar.get_y() + bar.get_height() / 2),
                xytext=(3, 0), textcoords='offset points', ha='left', va='center', fontsize=12)

# Create a pie chart for segment distribution (right subplot)
segment_distribution = df['Segment'].value_counts()
ax2.pie(segment_distribution, labels=segment_distribution.index, autopct='%1.1f%%', startangle=140, colors=['lightcoral', 'lightgreen', 'lightblue'])
ax2.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
ax2.set_title('Segment Distribution', fontsize=14)

# Show the plots
plt.tight_layout()
plt.show()
In [ ]:
import matplotlib.pyplot as plt
import numpy as np

# Calculate the total profit for each segment
segment_profits = df.groupby('Segment')['Profit'].sum()

# Calculate the total loss for each segment
segment_losses = df[df['Profit'] < 0].groupby('Segment')['Profit'].sum()

# Sort the segments by profit and loss in ascending order
sorted_segment_profits = segment_profits.sort_values(ascending=False)
sorted_segment_losses = segment_losses.sort_values(ascending=True)

# Create a figure with a single subplot
fig, ax = plt.subplots(figsize=(12, 6))

# Create horizontal bar chart for profit (right) and loss (left)
bar_width = 0.4
index = np.arange(len(sorted_segment_profits))
bars_profit = ax.barh(index, sorted_segment_profits.values, bar_width, color='skyblue', label='Profit')
bars_loss = ax.barh(index, sorted_segment_losses.values, bar_width, color='lightcoral', label='Loss', left=sorted_segment_profits.values)

# Set y-axis labels to the segment names
ax.set_yticks(index)
ax.set_yticklabels(sorted_segment_profits.index, fontsize=12)
ax.invert_yaxis()  # Invert the y-axis to have the highest value at the top
ax.set_xlabel('Total Profit / Loss', fontsize=12)
ax.set_title('Distribution of Profit and Loss by Segment', fontsize=14)
ax.grid(axis='x', linestyle='--', alpha=0.7)

# Annotate the bars with profit and loss values
for bar in bars_profit:
    width = bar.get_width()
    ax.annotate(f'{width:.2f}', xy=(width, bar.get_y() + bar_height / 2),
                xytext=(3, 0), textcoords='offset points', ha='left', va='center', fontsize=12)

for bar in bars_loss:
    width = bar.get_width()
    ax.annotate(f'{abs(width):.2f}', xy=(width, bar.get_y() + bar_height / 2),
                xytext=(-12, 0), textcoords='offset points', ha='right', va='center', fontsize=12)

# Add a legend
ax.legend(loc='upper right')

# Show the plot
plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
c:\Users\Irshad Yasseen A\Documents\Jupy\Untitled Folder\intern\Sparks\task2\Retail_EDA.ipynb Cell 64 line 3
     <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#Y230sZmlsZQ%3D%3D?line=31'>32</a> for bar in bars_profit:
     <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#Y230sZmlsZQ%3D%3D?line=32'>33</a>     width = bar.get_width()
---> <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#Y230sZmlsZQ%3D%3D?line=33'>34</a>     ax.annotate(f'{width:.2f}', xy=(width, bar.get_y() + bar_height / 2),
     <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#Y230sZmlsZQ%3D%3D?line=34'>35</a>                 xytext=(3, 0), textcoords='offset points', ha='left', va='center', fontsize=12)
     <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#Y230sZmlsZQ%3D%3D?line=36'>37</a> for bar in bars_loss:
     <a href='vscode-notebook-cell:/c%3A/Users/Irshad%20Yasseen%20A/Documents/Jupy/Untitled%20Folder/intern/Sparks/task2/Retail_EDA.ipynb#Y230sZmlsZQ%3D%3D?line=37'>38</a>     width = bar.get_width()

NameError: name 'bar_height' is not defined
In [ ]:
import matplotlib.pyplot as plt

# Filter the DataFrame for profit and loss separately
profit_df = df[df['Profit'] > 0]
loss_df = df[df['Profit'] < 0]

# Calculate the total profit for each segment
profit_segment_profits = profit_df.groupby('Segment')['Profit'].sum()

# Calculate the total loss for each segment
loss_segment_losses = loss_df.groupby('Segment')['Profit'].sum().abs()

# Create two subplots for profit and loss
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Create a pie chart for profit by segment
ax1.pie(profit_segment_profits, labels=profit_segment_profits.index, autopct='%1.1f%%', startangle=140,
        colors=['lightcoral', 'lightgreen', 'lightblue'])
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
ax1.set_title('Profit Distribution by Segment', fontsize=14)

# Create a pie chart for loss by segment
ax2.pie(loss_segment_losses, labels=loss_segment_losses.index, autopct='%1.1f%%', startangle=140,
        colors=['lightcoral', 'lightgreen', 'lightblue'])
ax2.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
ax2.set_title('Loss Distribution by Segment', fontsize=14)

# Show the pie charts
plt.tight_layout()
plt.show()